TODO
library(dplyr)
library(ggplot2)
library(tidyr)
library(plotly)
library(knitr)
library(caret)
library(randomForest)
# Load the battery dataset and split its column names into two groups:
# free-text columns and the remaining (numeric) columns.
data <- read.csv(file = "mp_batteries.csv")
columns <- colnames(data)

string_columns <- c(
  "Battery.Formula",
  "Working.Ion",
  "Formula.Charge",
  "Formula.Discharge"
)

# Battery.ID is left out of the numeric group as well.
non_numeric_columns <- c(string_columns, "Battery.ID")
numeric_columns <- setdiff(columns, non_numeric_columns)
Liczba wierszy: 4351.
Podsumowanie:
# Render the per-column summary of the dataset as a table.
data %>%
  summary() %>%
  kable()
| Battery.ID | Battery.Formula | Working.Ion | Formula.Charge | Formula.Discharge | Max.Delta.Volume | Average.Voltage | Gravimetric.Capacity | Volumetric.Capacity | Gravimetric.Energy | Volumetric.Energy | Atomic.Fraction.Charge | Atomic.Fraction.Discharge | Stability.Charge | Stability.Discharge | Steps | Max.Voltage.Step | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Length:4351 | Length:4351 | Length:4351 | Length:4351 | Length:4351 | Min. : 0.00002 | Min. :-7.755 | Min. : 5.176 | Min. : 24.08 | Min. :-583.5 | Min. :-2208.1 | Min. :0.00000 | Min. :0.007407 | Min. :0.00000 | Min. :0.00000 | Min. :1.000 | Min. : 0.0000 | |
| Class :character | Class :character | Class :character | Class :character | Class :character | 1st Qu.: 0.01747 | 1st Qu.: 2.226 | 1st Qu.: 88.108 | 1st Qu.: 311.62 | 1st Qu.: 211.7 | 1st Qu.: 821.6 | 1st Qu.:0.00000 | 1st Qu.:0.086957 | 1st Qu.:0.03301 | 1st Qu.:0.01952 | 1st Qu.:1.000 | 1st Qu.: 0.0000 | |
| Mode :character | Mode :character | Mode :character | Mode :character | Mode :character | Median : 0.04203 | Median : 3.301 | Median : 130.691 | Median : 507.03 | Median : 401.8 | Median : 1463.8 | Median :0.00000 | Median :0.142857 | Median :0.07319 | Median :0.04878 | Median :1.000 | Median : 0.0000 | |
| NA | NA | NA | NA | NA | Mean : 0.37531 | Mean : 3.083 | Mean : 158.291 | Mean : 610.62 | Mean : 444.1 | Mean : 1664.0 | Mean :0.03986 | Mean :0.159077 | Mean :0.14257 | Mean :0.12207 | Mean :1.167 | Mean : 0.1503 | |
| NA | NA | NA | NA | NA | 3rd Qu.: 0.08595 | 3rd Qu.: 4.019 | 3rd Qu.: 187.600 | 3rd Qu.: 722.75 | 3rd Qu.: 614.4 | 3rd Qu.: 2252.3 | 3rd Qu.:0.04762 | 3rd Qu.:0.200000 | 3rd Qu.:0.13160 | 3rd Qu.:0.09299 | 3rd Qu.:1.000 | 3rd Qu.: 0.0000 | |
| NA | NA | NA | NA | NA | Max. :293.19322 | Max. :54.569 | Max. :2557.627 | Max. :7619.19 | Max. :5926.9 | Max. :18305.9 | Max. :0.90909 | Max. :0.993333 | Max. :6.48710 | Max. :6.27781 | Max. :6.000 | Max. :26.9607 |
# For every text column (and Battery.ID) count how often each distinct value
# occurs, print the ten most frequent values as a table, and collect all
# counts in `counts` (columns: Value, Count, var) for the plot that follows.
count_columns <- c(string_columns, "Battery.ID")

# One slot per column; the pieces are combined once after the loop instead of
# growing a data frame with rbind() on every iteration.
counts_by_column <- vector("list", length(count_columns))
names(counts_by_column) <- count_columns

for (col in count_columns) {
  # `.data[[col]]` looks the column up by its string name, and naming the
  # expression `Value` avoids depending on an auto-generated column name.
  col_counts <- data %>%
    count(Value = .data[[col]], name = "Count") %>%
    arrange(desc(Count))

  top_10_counts <- col_counts %>%
    select(c("Value", "Count")) %>%
    slice(1:10)
  print(kable(top_10_counts, caption = paste("10 najliczniej występujących wartości zmiennej", col)))
  cat("\n")

  # Tag the rows with the column they came from (used for faceting later).
  col_counts$var <- col
  counts_by_column[[col]] <- col_counts
}

counts <- bind_rows(counts_by_column)
| Value | Count |
|---|---|
| Li0-1V2OF5 | 19 |
| Li0-1CoPO4 | 18 |
| Li0-1FePO4 | 18 |
| Li0-3MnFeCo(PO4)3 | 17 |
| Li0-1MnPO4 | 15 |
| Li0-1V4OF11 | 15 |
| Li0-1V4O5F7 | 12 |
| Li0-1VF5 | 12 |
| Li0-1CrP2O7 | 11 |
| Li0-2MnP2O7 | 11 |
| Value | Count |
|---|---|
| Li | 2440 |
| Ca | 435 |
| Mg | 423 |
| Zn | 366 |
| Na | 309 |
| K | 107 |
| Al | 95 |
| Y | 93 |
| Rb | 50 |
| Cs | 33 |
| Value | Count |
|---|---|
| MnO2 | 49 |
| TiO2 | 47 |
| VO2 | 46 |
| CrO2 | 45 |
| CoO2 | 43 |
| NiO2 | 41 |
| FeO2 | 36 |
| FePO4 | 26 |
| WO2 | 25 |
| CoPO4 | 24 |
| Value | Count |
|---|---|
| LiCoPO4 | 19 |
| LiFePO4 | 19 |
| LiMnPO4 | 19 |
| LiV2OF5 | 19 |
| Li5Mn6(BO3)6 | 18 |
| Li3MnFeCo(PO4)3 | 17 |
| LiV4OF11 | 15 |
| Li2MnP2O7 | 14 |
| Li2FeSiO4 | 13 |
| LiCrPO4 | 12 |
| Value | Count |
|---|---|
| mp-1001925_Mg | 1 |
| mp-1003319_Ca | 1 |
| mp-10033_Cs | 1 |
| mp-10033_Rb | 1 |
| mp-1008911_Li | 1 |
| mp-1009555_Li | 1 |
| mp-1009747_Li | 1 |
| mp-1009747_Na | 1 |
| mp-1012668_Li | 1 |
| mp-1012678_Na | 1 |
# Histogram of occurrence counts, one facet per variable; each facet has its
# own axis ranges because the counts differ strongly between variables.
ggplot(data = counts, mapping = aes(x = Count)) +
  geom_histogram(binwidth = 1, fill = "green", alpha = 0.7) +
  facet_wrap(~var, scales = "free") +
  theme_minimal() +
  labs(
    title = "Liczba wystąpień wartości dla zmiennej",
    x = "Liczba wystąpień",
    y = "Liczba różnych wartości"
  )
Battery.ID to identyfikator baterii. Jest on unikalny w zbiorze. Zmienne Battery.Formula, Formula.Charge i Formula.Discharge cechują się dużą liczbą różnych wartości. Oznacza to, że testowane jest wiele różnych możliwych substancji, które mogą zostać użyte do produkcji baterii. Inaczej jest w przypadku zmiennej Working.Ion. Ponad połowa materiałów jako głównego jonu używa litu. Wydaje się to być dominujący trend w badaniach nad bateriami.
# Count missing values in every numeric column.
# is.na() is used rather than is.nan(): empty cells read by read.csv() become
# NA, which is.nan() does not report, whereas is.na() covers both NA and NaN.
# vapply() guarantees an integer result of one count per column.
# The variable and column names are kept so the rendered table is unchanged.
nan_counts <- vapply(
  numeric_columns,
  function(col) sum(is.na(data[[col]])),
  integer(1)
)
nan_counts_df <- data.frame(
  nan = nan_counts
)
print(kable(nan_counts_df))
| | nan |
|---|---|
| Max.Delta.Volume | 0 |
| Average.Voltage | 0 |
| Gravimetric.Capacity | 0 |
| Volumetric.Capacity | 0 |
| Gravimetric.Energy | 0 |
| Volumetric.Energy | 0 |
| Atomic.Fraction.Charge | 0 |
| Atomic.Fraction.Discharge | 0 |
| Stability.Charge | 0 |
| Stability.Discharge | 0 |
| Steps | 0 |
| Max.Voltage.Step | 0 |
# Reshape the numeric columns to long form (one row per variable/value pair)
# and draw one histogram per variable, each with free axis scales.
numeric_df <- data[, numeric_columns]
numeric_df_long <- as.data.frame(
  pivot_longer(numeric_df, cols = everything())
)

ggplot(data = numeric_df_long, mapping = aes(x = value)) +
  geom_histogram(fill = "green", alpha = 0.7) +
  theme_minimal() +
  facet_wrap(~ name, scales = "free")
W zbiorze danych nie brakuje żadnych wartości. Rozkłady większości zmiennych mają wyraźnie zaznaczony najczęściej występujący przedział wartości. Z reguły jest on dość wąski w stosunku do całej dziedziny.
# Pairwise correlations between all numeric columns, flattened into a long
# table with columns x, y, cor.
numeric_df <- data[, numeric_columns]
correlation_matrix <- cor(numeric_df)
correlation_df <- setNames(
  as.data.frame(as.table(correlation_matrix)),
  c("x", "y", "cor")
)

# Keep each unordered pair once (and drop the diagonal) by requiring the
# first variable name to sort before the second.
is_first_of_pair <-
  as.character(correlation_df$x) < as.character(correlation_df$y)
correlation_df_one_dir <- correlation_df[is_first_of_pair, ]

# Show the pairs ordered from strongest to weakest absolute correlation.
strongest_first <- order(-abs(correlation_df_one_dir$cor))
kable(correlation_df_one_dir[strongest_first, ])
| | x | y | cor |
|---|---|---|---|
| 65 | Gravimetric.Energy | Volumetric.Energy | 0.9283253 |
| 39 | Gravimetric.Capacity | Volumetric.Capacity | 0.8584163 |
| 117 | Stability.Charge | Stability.Discharge | 0.8028701 |
| 32 | Atomic.Fraction.Discharge | Gravimetric.Capacity | 0.6807716 |
| 50 | Average.Voltage | Gravimetric.Energy | 0.6656523 |
| 44 | Atomic.Fraction.Discharge | Volumetric.Capacity | 0.6180186 |
| 91 | Atomic.Fraction.Charge | Atomic.Fraction.Discharge | 0.5974157 |
| 62 | Average.Voltage | Volumetric.Energy | 0.5545191 |
| 132 | Max.Voltage.Step | Steps | 0.5352539 |
| 3 | Gravimetric.Capacity | Max.Delta.Volume | 0.4337733 |
| 137 | Gravimetric.Energy | Max.Voltage.Step | 0.3292322 |
| 64 | Volumetric.Capacity | Volumetric.Energy | 0.3257482 |
| 125 | Gravimetric.Energy | Steps | 0.2946075 |
| 8 | Atomic.Fraction.Discharge | Max.Delta.Volume | 0.2906921 |
| 72 | Max.Voltage.Step | Volumetric.Energy | 0.2526625 |
| 37 | Max.Delta.Volume | Volumetric.Capacity | 0.2424769 |
| 71 | Steps | Volumetric.Energy | 0.2381420 |
| 63 | Gravimetric.Capacity | Volumetric.Energy | 0.2304216 |
| 51 | Gravimetric.Capacity | Gravimetric.Energy | 0.2132463 |
| 38 | Average.Voltage | Volumetric.Capacity | -0.2128178 |
| 41 | Gravimetric.Energy | Volumetric.Capacity | 0.2098406 |
| 69 | Stability.Charge | Volumetric.Energy | 0.1783271 |
| 20 | Atomic.Fraction.Discharge | Average.Voltage | -0.1716903 |
| 101 | Gravimetric.Energy | Stability.Charge | 0.1669819 |
| 98 | Average.Voltage | Stability.Charge | 0.1661371 |
| 128 | Atomic.Fraction.Discharge | Steps | 0.1641713 |
| 67 | Atomic.Fraction.Charge | Volumetric.Energy | -0.1473523 |
| 26 | Average.Voltage | Gravimetric.Capacity | -0.1462222 |
| 123 | Gravimetric.Capacity | Steps | 0.1333977 |
| 31 | Atomic.Fraction.Charge | Gravimetric.Capacity | 0.1289210 |
| 110 | Average.Voltage | Stability.Discharge | -0.1284568 |
| 134 | Average.Voltage | Max.Voltage.Step | 0.1271208 |
| 47 | Steps | Volumetric.Capacity | 0.1037051 |
| 140 | Atomic.Fraction.Discharge | Max.Voltage.Step | 0.1019796 |
| 45 | Stability.Charge | Volumetric.Capacity | 0.1015305 |
| 55 | Atomic.Fraction.Charge | Gravimetric.Energy | -0.0972924 |
| 135 | Gravimetric.Capacity | Max.Voltage.Step | 0.0951906 |
| 108 | Max.Voltage.Step | Stability.Charge | 0.0940466 |
| 2 | Average.Voltage | Max.Delta.Volume | -0.0823707 |
| 113 | Gravimetric.Energy | Stability.Discharge | -0.0782609 |
| 56 | Atomic.Fraction.Discharge | Gravimetric.Energy | 0.0645248 |
| 99 | Gravimetric.Capacity | Stability.Charge | 0.0633871 |
| 130 | Stability.Discharge | Steps | -0.0631686 |
| 122 | Average.Voltage | Steps | 0.0627851 |
| 48 | Max.Voltage.Step | Volumetric.Capacity | 0.0626085 |
| 68 | Atomic.Fraction.Discharge | Volumetric.Energy | 0.0610586 |
| 5 | Gravimetric.Energy | Max.Delta.Volume | -0.0609858 |
| 70 | Stability.Discharge | Volumetric.Energy | -0.0599949 |
| 61 | Max.Delta.Volume | Volumetric.Energy | -0.0588321 |
| 115 | Atomic.Fraction.Charge | Stability.Discharge | -0.0523971 |
| 19 | Atomic.Fraction.Charge | Average.Voltage | -0.0385556 |
| 129 | Stability.Charge | Steps | -0.0374860 |
| 97 | Max.Delta.Volume | Stability.Charge | 0.0337587 |
| 104 | Atomic.Fraction.Discharge | Stability.Charge | 0.0324051 |
| 46 | Stability.Discharge | Volumetric.Capacity | 0.0317012 |
| 127 | Atomic.Fraction.Charge | Steps | 0.0297369 |
| 103 | Atomic.Fraction.Charge | Stability.Charge | -0.0273571 |
| 7 | Atomic.Fraction.Charge | Max.Delta.Volume | 0.0213153 |
| 120 | Max.Voltage.Step | Stability.Discharge | -0.0165552 |
| 116 | Atomic.Fraction.Discharge | Stability.Discharge | 0.0143204 |
| 121 | Max.Delta.Volume | Steps | -0.0132582 |
| 111 | Gravimetric.Capacity | Stability.Discharge | 0.0125390 |
| 133 | Max.Delta.Volume | Max.Voltage.Step | -0.0099251 |
| 109 | Max.Delta.Volume | Stability.Discharge | 0.0077357 |
| 139 | Atomic.Fraction.Charge | Max.Voltage.Step | 0.0053420 |
| 43 | Atomic.Fraction.Charge | Volumetric.Capacity | 0.0012456 |
# Interactive heatmap of the absolute pairwise correlations; hovering over a
# tile shows the pair of variables and the absolute correlation value.
p <- ggplot(correlation_df) +
  geom_tile(aes(x = x, y = y, fill = abs(cor), text = paste("Korelacja pomiędzy", x, "i", y, "=", abs(cor)))) +
  labs(fill = "Korelacja") +
  scale_fill_gradient(low = "white", high = "green") +
  # theme_minimal() is a complete theme and replaces every earlier theme()
  # setting, so the axis-title override has to be added after it.
  theme_minimal() +
  theme(axis.title = element_blank())

ggplotly(p, tooltip = "text") %>%
  layout(
    xaxis = list(
      tickangle = 45,
      title = ""
    ),
    yaxis = list(
      title = ""
    )
  )
# The five variable pairs with the largest absolute correlation.
top_5_correlation <- correlation_df_one_dir %>%
  arrange(desc(abs(cor))) %>%
  slice(1:5)

kable(top_5_correlation)
| x | y | cor |
|---|---|---|
| Gravimetric.Energy | Volumetric.Energy | 0.9283253 |
| Gravimetric.Capacity | Volumetric.Capacity | 0.8584163 |
| Stability.Charge | Stability.Discharge | 0.8028701 |
| Atomic.Fraction.Discharge | Gravimetric.Capacity | 0.6807716 |
| Average.Voltage | Gravimetric.Energy | 0.6656523 |
# Build an interactive scatter plot with a fitted linear trend line for one
# pair of columns.
#
# df     data frame containing `Battery.ID` and the two columns to plot
# x_var  name (string) of the column shown on the x axis
# y_var  name (string) of the column shown on the y axis
#
# Returns the plotly widget; hovering over a point shows the battery ID and
# both values. All five plots share this helper so their tooltips are
# labelled consistently ("<name>: <value>" on every line).
plot_variable_pair <- function(df, x_var, y_var) {
  scatter <- ggplot(df, aes(x = .data[[x_var]], y = .data[[y_var]])) +
    # The tooltip text is mapped on the point layer only, so the trend line
    # drawn by geom_smooth() does not receive it.
    geom_point(aes(
      text = paste(
        "ID baterii:", Battery.ID,
        paste0("\n", x_var, ":"), .data[[x_var]],
        paste0("\n", y_var, ":"), .data[[y_var]]
      )
    )) +
    geom_smooth(method = lm) +
    # Axis labels are set explicitly; otherwise they would show the
    # `.data[[...]]` expressions instead of the column names.
    labs(title = paste(x_var, "i", y_var), x = x_var, y = y_var) +
    theme_minimal()

  ggplotly(scatter, tooltip = "text")
}

# One plot for each of the five most strongly correlated pairs.
plot_variable_pair(data, "Gravimetric.Energy", "Volumetric.Energy")
plot_variable_pair(data, "Gravimetric.Capacity", "Volumetric.Capacity")
plot_variable_pair(data, "Stability.Charge", "Stability.Discharge")
plot_variable_pair(data, "Atomic.Fraction.Discharge", "Gravimetric.Capacity")
plot_variable_pair(data, "Average.Voltage", "Gravimetric.Energy")
Wysoka korelacja zmiennych Gravimetric.Energy i Volumetric.Energy oraz Gravimetric.Capacity i Volumetric.Capacity wydaje się logiczna, ponieważ te pary zmiennych powiązane są z gęstością energii i pojemności baterii. Dla pary Stability.Charge i Stability.Discharge wynik również jest zgodny z oczekiwaniami. Stabilność substancji w jednym stanie może być związana z jej stabilnością w innym. Dwie pozostałe pary zmiennych o wysokiej korelacji (dla których ta korelacja jest już znacznie niższa) wydają się bardziej interesujące z punktu widzenia badań nad nowymi materiałami.
Najważniejszym trendem w badaniach nad bateriami wydaje się być zastosowanie litu jako głównego jonu odpowiadającego za transport ładunku. Innymi pierwiastkami często stosowanymi w tym celu są wapń i magnez, jednak występują one znacznie rzadziej.
# Number of charged-state formulas that contain an uppercase "O".
# NOTE(review): the pattern would also match two-letter symbols starting with
# "O" (e.g. "Os") — confirm whether such formulas occur in the data.
charge_has_o <- grepl("O", data$Formula.Charge, ignore.case = FALSE)
print(sum(charge_has_o))
## [1] 3824
# Number of discharged-state formulas that contain an uppercase "O".
# NOTE(review): the pattern would also match two-letter symbols starting with
# "O" (e.g. "Os") — confirm whether such formulas occur in the data.
discharge_has_o <- grepl("O", data$Formula.Discharge, ignore.case = FALSE)
print(sum(discharge_has_o))
## [1] 3824
Innym pierwiastkiem często używanym w materiałach do produkcji baterii jest tlen. 3824 materiały ze zbioru zawierają go w stanie naładowanym i rozładowanym.
Ważnym celem w badaniach nad bateriami jest zwiększanie pojemności oraz gęstości energii. Na bazie uzyskanych wartości korelacji pomiędzy zmiennymi wydaje się, że badania nad zwiększeniem udziału atomowego składników w stanie rozładowanym mogą mieć pozytywne skutki dla pojemności, a nad średnim napięciem – dla energii.
W celu predykcji wartości Volumetric.Capacity nowych materiałów zastosowano regresor wykorzystujący algorytm Random Forest. Z danych uczących usunięto kolumny tekstowe z wyjątkiem Working.Ion, z uwagi na ich niską informatywność dla regresora – bardzo duża liczba wartości kategorycznych. Usunięto również zmienną Gravimetric.Capacity – wysoka korelacja z Volumetric.Capacity sprawia, że predykcja na jej bazie jest pozbawiona sensu.
# Train a random-forest regressor for Volumetric.Capacity.
# The free-text formula columns, the ID and Gravimetric.Capacity are excluded
# from the predictors; Working.Ion stays in as the only text feature.
regression_columns <- setdiff(
  columns,
  c(
    "Battery.Formula", "Formula.Charge", "Formula.Discharge",
    "Battery.ID", "Gravimetric.Capacity"
  )
)
source <- data[regression_columns]

# Fix the RNG seed so the train/test split, the CV folds and the forest are
# the same on every run. This assumes sequential execution; with a parallel
# backend the per-resample seeds would also have to be supplied through
# trainControl(seeds = ...).
set.seed(42)

# 70% of the rows go to training, the rest is held out for testing.
idx <- createDataPartition(y = source$Volumetric.Capacity, p = 0.7, list = FALSE)
train <- source[idx, ]
test <- source[-idx, ]

# 2-fold cross-validation repeated 5 times.
ctrl <- trainControl(
  method = "repeatedcv",
  number = 2,
  repeats = 5
)

fit <- train(
  Volumetric.Capacity ~ .,
  data = train,
  method = "rf",
  trControl = ctrl,
  importance = TRUE,
  ntree = 10
)

# Predictions for the held-out rows (numeric values, despite the name).
rfClasses <- predict(fit, newdata = test)
# Table of the resampling results stored in the fitted caret object.
kable(
  x = fit[["results"]],
  caption = "Tabela wyników z podzbioru uczącego"
)
| mtry | RMSE | Rsquared | MAE | RMSESD | RsquaredSD | MAESD |
|---|---|---|---|---|---|---|
| 2 | 269.8344 | 0.8057482 | 135.34697 | 19.47533 | 0.0436412 | 7.129146 |
| 10 | 182.1383 | 0.8991943 | 62.81230 | 19.41171 | 0.0216480 | 3.817609 |
| 19 | 169.5374 | 0.9109742 | 47.90967 | 17.93144 | 0.0193543 | 3.153736 |
# Compare the test-set predictions with the observed values and show the
# first two metrics (RMSE and R squared).
observed_capacity <- test$Volumetric.Capacity
res_pred <- postResample(pred = rfClasses, obs = observed_capacity)
kable(head(res_pred, 2))
| | x |
|---|---|
| RMSE | 144.7018364 |
| Rsquared | 0.9362563 |
# Interactive histogram of the absolute prediction error on the test set.
# The aes() expression is kept verbatim because ggplotly shows it in the
# hover text.
error_hist <- ggplot() +
  geom_histogram(
    aes(x = abs(test$Volumetric.Capacity - rfClasses)),
    binwidth = 5,
    fill = "green",
    alpha = 0.7
  ) +
  theme_minimal() +
  labs(
    title = "Rozkład błędu predykcji wartości Volumetric.Capacity",
    x = "Wartość bezwzględna błędu",
    y = "Liczba przypadków"
  )

ggplotly(error_hist)
Predykcję wartości Volumetric.Capacity dla nowych baterii przeprowadzono na trzech przykładach. Zostały one przedstawione w poniższej tabeli:
# Three hand-made example materials (one per row) with the same predictor
# columns the model was trained on.
new_materials <- data.frame(
  Working.Ion               = c("Li", "Li", "Ca"),
  Max.Delta.Volume          = c(3, 0.1, 0.5),
  Average.Voltage           = c(0.1, -0.2, 0.4),
  Gravimetric.Energy        = c(200, 50, 100),
  Volumetric.Energy         = c(600, 150, 300),
  Atomic.Fraction.Charge    = c(0, 0, 0.5),
  Atomic.Fraction.Discharge = c(0.75, 0.8, 0.9),
  Stability.Charge          = c(0, 0.1, 0.05),
  Stability.Discharge       = c(0.01, 0.05, 0.1),
  Steps                     = c(1, 1, 1),
  Max.Voltage.Step          = c(0, 0, 0)
)

# Predict with the fitted model and append the result as an extra column.
predicted <- predict(fit, newdata = new_materials)
new_materials[["Predicted.Volumetric.Capacity"]] <- predicted

kable(new_materials)
| Working.Ion | Max.Delta.Volume | Average.Voltage | Gravimetric.Energy | Volumetric.Energy | Atomic.Fraction.Charge | Atomic.Fraction.Discharge | Stability.Charge | Stability.Discharge | Steps | Max.Voltage.Step | Predicted.Volumetric.Capacity |
|---|---|---|---|---|---|---|---|---|---|---|---|
| Li | 3.0 | 0.1 | 200 | 600 | 0.0 | 0.75 | 0.00 | 0.01 | 1 | 0 | 4702.761 |
| Li | 0.1 | -0.2 | 50 | 150 | 0.0 | 0.80 | 0.10 | 0.05 | 1 | 0 | 1397.106 |
| Ca | 0.5 | 0.4 | 100 | 300 | 0.5 | 0.90 | 0.05 | 0.10 | 1 | 0 | 2433.794 |